1 이항 회귀모형

1.1 데이터

library(tidyverse)
library(rvest)

# Wikipedia article containing the worked hours-studied / exam-pass example.
lr_data_url <- "https://en.wikipedia.org/wiki/Logistic_regression"

# Scrape every ".wikitable" from the page, parse them into tibbles,
# and keep the first one (the example data table).
lr_data_raw <- lr_data_url %>% 
  read_html() %>% 
  html_elements(".wikitable") %>% 
  html_table() %>% 
  purrr::pluck(1)

# Reshape the scraped wide table into two tidy numeric-ish columns:
# 학습시간 (hours studied) and 입학여부 (admitted 1 / not admitted 0).
# NOTE(review): assumes the cleaned first column `x1` labels each original
# row, with the hours row containing the word "Hours" — confirm against
# the scraped table before changing the scrape source.
lr_tbl <- lr_data_raw %>% 
  janitor::clean_names() %>% 
  # Wide -> long: one row per (original column, x1-label) pair
  pivot_longer(-x1) %>% 
  # Map the English row labels to the Korean column names used downstream
  mutate(구분 = ifelse(str_detect(x1, "Hours"), "학습시간", "입학여부")) %>% 
  select(name, 구분, value) %>% 
  # Long -> wide: one row per observation, columns 학습시간 / 입학여부
  pivot_wider(names_from = 구분, values_from = value)  %>% 
  select(학습시간, 입학여부)

# Cache the scraped table locally so later runs can skip the network call.
# fs::dir_create("data")

write_rds(lr_tbl, "data/lr_tbl.rds")
lr_tbl <- read_rds("data/lr_tbl.rds")

# Browse the data interactively
reactable::reactable(lr_tbl)

1.2 요약 통계량

# Per-column summary statistics (missingness, mean, sd, quantiles, histogram)
lr_tbl %>% 
  skimr::skim()
Data summary
Name Piped data
Number of rows 20
Number of columns 2
_______________________
Column type frequency:
numeric 2
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
학습시간 0 1 2.79 1.51 0.5 1.69 2.62 4.06 5.5 ▇▇▆▅▅
입학여부 0 1 0.50 0.51 0.0 0.00 0.50 1.00 1.0 ▇▁▁▁▇

1.3 시각화

# Scatter of admission outcome against hours studied, with the fitted
# logistic curve overlaid (glm, binomial family, no confidence band).
ggplot(lr_tbl, aes(x = 학습시간, y = 입학여부)) +
  geom_point() +
  geom_smooth(
    method      = "glm",
    method.args = list(family = "binomial"),
    se          = FALSE
  ) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "학습시간과 입학확률",
    x     = "학습시간",
    y     = "합격확률 (%)"
  ) +
  theme_light()

1.4 내장 모형 활용

# Benchmark fit with stats::glm(): logit(P(입학여부 = 1)) = alpha + beta * 학습시간
adm_lr <- glm(입학여부 ~ 학습시간, family = "binomial", data = lr_tbl)

# Print the fitted coefficients and deviance summary
adm_lr

Call:  glm(formula = 입학여부 ~ 학습시간, family = "binomial", 
    data = lr_tbl)

Coefficients:
(Intercept)     학습시간  
     -4.078        1.505  

Degrees of Freedom: 19 Total (i.e. Null);  18 Residual
Null Deviance:      27.73 
Residual Deviance: 16.06    AIC: 20.06

1.5 합격 예측 시각화

library(crosstalk)

# Evenly spaced study-hour grid for drawing the fitted probability curve
crosstalk_lr_raw <- tibble(학습시간 = seq(from = 1, to = 5, by = 0.1))

crosstalk_lr_tbl <- crosstalk_lr_raw %>% 
  # Fitted admission probability at each grid point
  mutate(합격확률 = predict(adm_lr, newdata = crosstalk_lr_raw, type = "response")) %>% 
  # Attach observed outcomes; the join key is made explicit so the join no
  # longer relies on implicit-key detection (which also emits a message).
  # NOTE(review): joining on a floating-point column only matches grid points
  # that coincide exactly with observed 학습시간 values — confirm intended.
  left_join(lr_tbl, by = "학습시간") %>% 
  # Recode 0/1 outcomes as labelled factor levels for the plot legend
  mutate(입학여부 = factor(입학여부, levels = c(0, 1), labels = c("불합격", "합격")))

# Overlay the observed 0/1 outcomes (recovered from the factor codes) on the
# predicted probability curve, then render the plot interactively.
crosstalk_lr_g <- ggplot(crosstalk_lr_tbl, aes(x = 학습시간, y = 합격확률)) +
  geom_point() +
  # Observed outcomes: factor codes are 1/2, so subtract 1 to land on 0/1
  geom_point(aes(x = 학습시간, y = as.numeric(입학여부) - 1, color = 입학여부)) +
  geom_smooth(
    method      = "glm",
    method.args = list(family = "binomial"),
    se          = FALSE
  ) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "학습시간과 입학확률",
    x     = "학습시간",
    y     = "합격확률 (%)"
  ) +
  theme_light()

plotly::ggplotly(crosstalk_lr_g)

1.6 직접 구현

최우추정량(MLE)을 찾는 것은 음의 로그 우도(Negative Log-Likelihood) 값을 최소화하는 것과 동일하기 때문에, 범용 최적화(General-purpose optimization) 함수에 음의 로그 우도 함수를 정의해서 초기값을 준 모수와 함께 넣고, 반복적으로 근사시켜 모수를 계산한다.

\[ NLL(y) = -{\log(p(y))} \]

\[ \min_{\theta} \sum_y {-\log(p(y;\theta))} \]

\[ \max_{\theta} \prod_y p(y;\theta) \]

# Sigmoid (standard logistic) function.
#
# Maps a real-valued input to the open interval (0, 1); vectorized over x.
# The 1 / (1 + exp(-x)) form is kept deliberately: it stays finite for
# large positive x, unlike the exp(x) / (1 + exp(x)) form.
sigmoid_fn <- function(x) {
  1 / (1 + exp(-x))
}

# Negative log-likelihood of a binary logistic regression model.
#
# @param par numeric parameter vector. When include_alpha is TRUE, par[1]
#   is the intercept (alpha) and par[2:length(par)] are slope coefficients,
#   one per predictor column; otherwise every element is a slope coefficient.
# @param data data frame containing the predictors and the response column.
# @param y name (string) of the response column in `data`; the response is
#   assumed to be coded 0/1.
# @param include_alpha include an intercept term? Default now spelled TRUE
#   (the original `T` is a reassignable binding, not a reserved word).
# @return scalar -log L(par; data), suitable for a minimizer such as optimx.
neg_log_likelihood <- function(par, data, y, include_alpha = TRUE) {
  
  # Split predictors from the response column
  x <- data[, names(data) != y]
  y_data <- data[, y]

  # 1. Linear combination of predictors and coefficients
  if (include_alpha) {

    # beta_1 * x_1 + beta_2 * x_2 + ...
    linear_combination <- mapply("*", x, par[2:length(par)])

    # Add the intercept: alpha + beta_1 * x_1 + beta_2 * x_2 + ...
    theta <- rowSums(linear_combination) + par[1]
  } else {
    theta <- rowSums(mapply("*", x, par))
  }

  # 2. Success probability via the sigmoid
  p <- sigmoid_fn(theta)

  # 3. Negative log-likelihood of the Bernoulli model
  value <- -sum(y_data * log(p) + (1 - y_data) * log(1 - p))

  return(value)
}

library(optimx)

# Minimize the negative log-likelihood with Nelder-Mead, starting from (0, 0);
# trace = TRUE prints each simplex step of the search.
lr_opt <- optimx(
  par           = c(0, 0),
  fn            = neg_log_likelihood,
  data          = lr_tbl,
  y             = "입학여부",
  method        = "Nelder-Mead",
  include_alpha = TRUE,
  itnmax        = 100,
  control       = list(trace = TRUE, all.methods = FALSE)
)
fn is  fn 
Looking for method =  Nelder-Mead 
Function has  2  arguments
Analytic gradient not made available.
Analytic Hessian not made available.
Scale check -- log parameter ratio= -Inf   log bounds ratio= NA 
Method:  Nelder-Mead 
  Nelder-Mead direct search function minimizer
function value for initial parameters = 13.862944
  Scaled convergence tolerance is 2.06574e-07
Stepsize computed as 0.100000
BUILD              3 13.887933 13.096825
EXTENSION          5 13.862944 12.582216
EXTENSION          7 13.096825 12.067907
EXTENSION          9 12.582216 11.285614
LO-REDUCTION      11 12.067907 11.285614
LO-REDUCTION      13 11.874408 11.285614
EXTENSION         15 11.469089 10.348151
LO-REDUCTION      17 11.285614 10.348151
EXTENSION         19 10.370274 8.914547
LO-REDUCTION      21 10.348151 8.914547
EXTENSION         23 9.266657 8.226456
REFLECTION        25 8.914547 8.030679
LO-REDUCTION      27 8.259889 8.030679
LO-REDUCTION      29 8.226456 8.030679
LO-REDUCTION      31 8.114076 8.030679
HI-REDUCTION      33 8.053435 8.030679
HI-REDUCTION      35 8.052924 8.030679
LO-REDUCTION      37 8.038012 8.030679
HI-REDUCTION      39 8.033349 8.030679
LO-REDUCTION      41 8.031439 8.030144
HI-REDUCTION      43 8.030679 8.030087
HI-REDUCTION      45 8.030144 8.029950
HI-REDUCTION      47 8.030087 8.029938
HI-REDUCTION      49 8.029950 8.029917
HI-REDUCTION      51 8.029938 8.029881
LO-REDUCTION      53 8.029917 8.029881
HI-REDUCTION      55 8.029890 8.029881
LO-REDUCTION      57 8.029889 8.029880
HI-REDUCTION      59 8.029881 8.029880
HI-REDUCTION      61 8.029880 8.029879
HI-REDUCTION      63 8.029880 8.029879
REFLECTION        65 8.029879 8.029879
Exiting from Nelder Mead minimizer
    67 function evaluations used
Post processing for method  Nelder-Mead 
Successful convergence! 
Compute Hessian approximation at finish of  Nelder-Mead 
Compute gradient approximation at finish of  Nelder-Mead 
Save results from method  Nelder-Mead 
$par
[1] -4.076953  1.504453

$value
[1] 8.029879

$message
NULL

$convcode
[1] 0

$fevals
function 
      67 

$gevals
gradient 
      NA 

$nitns
[1] NA

$kkt1
[1] TRUE

$kkt2
[1] TRUE

$xtimes
user.self 
     0.08 

Assemble the answers
# Pull the fitted parameters (p1 = intercept, p2 = slope) for the
# Nelder-Mead run out of the optimx result table.
lr_opt[, 1:5] %>% 
  as.data.frame() %>% 
  rownames_to_column("method") %>% 
  filter(method == "Nelder-Mead") %>% 
  select(method, p1, p2)
       method        p1       p2
1 Nelder-Mead -4.076953 1.504453

glm() 함수로 구현한 것과 값이 동일한지 상호확인한다.

# glm() coefficients — should agree with the optimx estimates above
adm_lr$coefficients
(Intercept)    학습시간 
  -4.077713    1.504645 

1.7 예측값 생성

# Predict success probabilities from a fitted logistic-regression parameter set.
#
# @param x   data frame of predictor columns (one column per slope coefficient).
# @param par coefficient estimates: a plain numeric vector, or a one-row data
#   frame such as a row of optimx output (`lr_opt[, 1:2]`). When par has one
#   more element than x has columns, par[1] is treated as the intercept.
# @return numeric vector of probabilities in (0, 1), one per row of x.
predict_fn <- function(x, par){

  # Normalize par up front so both branches accept a one-row data frame;
  # the original only coerced the intercept (as.numeric(par[1])), leaving
  # the no-intercept branch inconsistent.
  par <- as.numeric(unlist(par))

  if( ncol(x) < length(par) ){
    # First element is the intercept; the rest multiply the predictor columns
    theta <- rowSums(mapply("*", x, par[2:length(par)])) + par[1]
  } else {
    theta <- rowSums(mapply("*", x, par))
  }

  prob <- sigmoid_fn(theta)

  return(prob)
}

# Generate predictions with the hand-rolled predict_fn() (optimx estimates)
# and with stats::predict() on the glm fit, and show them side by side.
custom_pred <- predict_fn(lr_tbl %>% select(학습시간),  lr_opt[, 1:2])

lr_tbl  %>% 
  mutate(자체제작_예측 = custom_pred) %>% 
  mutate(예측 = predict(adm_lr, newdata = lr_tbl %>% select(학습시간), type ="response")) %>% 
  reactable::reactable()
 

데이터 과학자 이광춘 저작

kwangchun.lee.7@gmail.com